
**************************************************************************************
**************************************************************************************
********************************** CLEAN DATA ****************************************
**************************************************************************************
**************************************************************************************

* NOTE: All file paths and data names that need to be inserted are marked with <>.
* Run this do-file after the SAS code, once its output has been converted to Stata format.

* Session setup: close any log left open, empty memory, and turn off paging.
capture log close
clear
set more off

* Root folders. Fill in the <> placeholders before running.
global dta    "<insert path to raw data>"
global output "<insert path to outputs>"

* Log the whole run and stamp it with the current date and time.
log using "${output}\log\create_samples.log", replace
display "Time $S_DATE $S_TIME"

**************************************************************************************

* Load the master file (a SAS file that was converted to Stata format).
use "${dta}\master_file.dta", clear

*** Restrictions and organization of variables
* Retain observations with startpay on or after 1 Jan 1997 and before 1 Jan 2010.
drop if !(startpay>=mdy(1,1,1997) & startpay<mdy(1,1,2010))

* Attach display labels to the covariates.
label variable age_filing "Age at Filing"
label variable male       "Male"
label variable hearings   "ALJ Hearing"
label variable black      "Black"

* Mirror each variable label into a global named l_<varname>.
foreach v in age_filing male hearings black {
	global l_`v' : variable label `v'
}

*** DI program variables
* Twelve times pia.
gen annual_pia = 12*pia
label variable annual_pia "Annualized PIA"

*** Death variables
* d_year1 to d_year4 equal 1 when dodbest is nonmissing and falls in the k-th
* 365.25-day window counted from startpay, and 0 otherwise.
forvalues k = 1/4 {
	gen d_year`k' = (dodbest!=. & dodbest>=(startpay+365.25*(`k'-1)) & dodbest<(startpay+365.25*`k'))
}

* d_year14 to d_year110 equal the indicator for dodbest falling within the first
* N windows after startpay, divided by N, for N = 4 to 10.
forvalues span = 4/10 {
	gen d_year1`span' = (dodbest!=. & dodbest>=startpay & dodbest<(startpay+365.25*`span')) / `span'
	label variable d_year1`span' "Avg mort years 1-`span'"
}
	
*** Earnings
* Top-code each of ep1 to ep4 at its own 99th percentile.
* Stata orders missing values above every number, so a bare comparison with the
* cap would also be true for missing earnings and overwrite them with the cap.
* The !missing() guard leaves missing values untouched.
* scalar(p99) is used so that a dataset variable named p99 cannot shadow the scalar.
foreach earnvar of varlist ep1-ep4 {
	su `earnvar', detail
	scalar p99=r(p99)
	di scalar(p99)
	replace `earnvar'=scalar(p99) if `earnvar'>scalar(p99) & !missing(`earnvar')
}

* Sum of ep1 to ep4 divided by 48. The result is missing when any of the four
* components is missing.
gen e1234=(ep1+ep2+ep3+ep4)/(4*12)
label variable e1234 "Monthly earnings"

*** Primary disability
* Blank out the placeholder codes 0000 and 9999 in dig1 to dig5.
forvalues k = 1/5 {
	replace dig`k' = "" if inlist(dig`k', "0000", "9999")
}

* diagnosis takes the first nonblank value found when scanning dig1 through dig5.
gen diagnosis = ""
forvalues k = 1/5 {
	replace diagnosis = dig`k' if diagnosis == "" & dig`k' != ""
}

* Numeric copy of the code, used for the range comparisons that follow.
destring diagnosis, gen(diag_num)

* following the codebook classification scheme
* impairment starts at 25 (Unknown codes) and is overwritten whenever diag_num
* falls in one of the listed ranges. Observations with missing diag_num match
* no range and therefore stay at 25.
* Leading zeros in the range bounds are cosmetic (0110 is read as 110).
#delimit ;
gen impairment=25;
	replace impairment=1 if inrange(diag_num,2960,2969) |
							inrange(diag_num,3110,3119) ;
	replace impairment=2 if inrange(diag_num,2950,2959) |
							inrange(diag_num,2980,2989) ;
	replace impairment=3 if					
							inrange(diag_num,3000,3019) |
							inrange(diag_num,3080,3099) ;
	replace impairment=4 if						
							inrange(diag_num,2900,2949) |
							inrange(diag_num,2990,2999) |
							inrange(diag_num,3030,3079) |
							inrange(diag_num,3100,3109) |
							inrange(diag_num,3120,3129) |
							inrange(diag_num,3138,3169) |
							diag_num==3195              ;
	replace impairment=5 if inrange(diag_num,3170,3194) |
							inrange(diag_num,3196,3199) ;
	replace impairment=6 if inrange(diag_num,7221,7249) ;
	replace impairment=7 if inrange(diag_num,7100,7200) |
							inrange(diag_num,7250,7399) ;
	replace impairment=8 if						
							inrange(diag_num,0110,0119) |
							inrange(diag_num,0450,0459) |
							inrange(diag_num,0930,1359) |
							inrange(diag_num,1380,1389) ;
	replace impairment=9 if						
							inrange(diag_num,0070,0079) |
							inrange(diag_num,0201,0449) |
							inrange(diag_num,0540,0559) |
							inrange(diag_num,0780,0789) |
							inrange(diag_num,1360,1369) ;
	replace impairment=10 if inrange(diag_num,1400,2399);
	replace impairment=11 if						
							inrange(diag_num,2400,2479) |
							inrange(diag_num,2500,2559) |
							inrange(diag_num,2630,2799) ;
	replace impairment=12 if inrange(diag_num,2800,2899);
	replace impairment=13 if						
							inrange(diag_num,3610,3699) |
							inrange(diag_num,3780,3789) ;
	replace impairment=14 if inrange(diag_num,3890,3899);
	replace impairment=15 if inrange(diag_num,7840,7849);
	replace impairment=16 if						
							inrange(diag_num,3200,3419) |
							inrange(diag_num,3430,3599) |
							inrange(diag_num,3860,3889) ;
	replace impairment=17 if						
							inrange(diag_num,3420,3429) |
							inrange(diag_num,3750,3759) |
							inrange(diag_num,3900,4599) ;
	replace impairment=18 if						
							inrange(diag_num,4600,4869) |
							inrange(diag_num,4910,5199) |
							inrange(diag_num,7690,7699) ;
	replace impairment=19 if inrange(diag_num,5200,5799);
	replace impairment=20 if inrange(diag_num,5800,6299);
	replace impairment=21 if inrange(diag_num,6900,7099);
	replace impairment=22 if inrange(diag_num,7400,7599);
	replace impairment=23 if inrange(diag_num,8000,9599);
	replace impairment=24 if						
							inrange(diag_num,0000,0069) |
							inrange(diag_num,0680,0689) |
							inrange(diag_num,2480,2499) |
							inrange(diag_num,2580,2589) |
							diag_num==3130              |
							inrange(diag_num,4880,4889) |
							inrange(diag_num,6300,6889) |							          
							inrange(diag_num,7600,7689) |
							inrange(diag_num,7740,7839) |
							inrange(diag_num,7850,7959) |
							inrange(diag_num,9840,9849) ;
							
* value labels for the 25 impairment codes;
	label define imp_label
			1 "Major Affective"
			2 "Schizophrenia/Psychoses"
			3 "Anxiety/neurotic"
			4 "Other mental"
			5 "Retardation"
			6 "Back"
			7 "Musculoskeletal"
			8 "Infectious/parasitic"
			9 "HIV/AIDS"
			10 "Neoplasms"
			11 "Endocrine/nutritional"
			12 "Blood"
			13 "Visual"
			14 "Hearing"
			15 "Speech"
			16 "Nervous"
			17 "Circulatory"
			18 "Respiratory"
			19 "Digestive"
			20 "Genitourinary"
			21 "Skin/subcutaneous"
			22 "Congenital"
			23 "Injuries"
			24 "Other"
			25 "Unknown codes";			
	label values impairment imp_label;

* frequency table of the resulting categories;
tab impairment;

* grouped categories: codes 1 to 5 become 1, codes 6 and 7 become 2, code 10
  becomes 3, code 17 becomes 4, and every other code becomes 9;
gen impairment2 = cond(inrange(impairment,1,5), 1,
                  cond(inlist(impairment,6,7),  2,
                  cond(impairment==10,          3,
                  cond(impairment==17,          4, 9))));

* value labels for the grouped codes;
label define imp_label2 1 "Mental" 2 "Musculoskeletal" 3 "Neoplasms" 4 "Circulatory" 9 "Other";
label values impairment2 imp_label2;
	
* impairment variables;
#delimit cr
* One 0/1 indicator per grouped impairment category (impairment2). Each variable
* label is also stored in a global named l_<varname>.
* Corrections relative to the earlier version of this section:
*   - circulatory and other are built from impairment2, because no impairment3
*     variable is created anywhere in this script
*   - the Other label is attached to other, not to a variable named musculo
*   - l_musc is read from musc, because no musculo variable is created anywhere
*     in this script
gen mental      = (impairment2==1)
gen musc        = (impairment2==2)
gen neoplasms   = (impairment2==3)
gen circulatory = (impairment2==4)
gen other       = (impairment2==9)

label variable mental      "Mental Impairment"
label variable musc        "Musculoskeletal"
label variable circulatory "Circulatory"
label variable neoplasms   "Neoplasms"
label variable other       "Other"

global l_mental: variable label mental
global l_musc: variable label musc
global l_circulatory: variable label circulatory
global l_neoplasms: variable label neoplasms
global l_other: variable label other

* Store the cleaned file. The sample-building sections reload it.
save "${dta}\master_file2.dta", replace

****************************************************************
*** Create datasets for lower, family & upper bendpoints      
****************************************************************	

*** Main samples

local sample main

use "${dta}\master_file2.dta", clear

* Variables carried into the analysis files. The list is written with line
* continuation markers. The earlier version switched to the carriage-return
* delimiter right before a multi-line list that ended with a semicolon, so Stata
* read keep with no variable list and stopped with an error.
keep ///
	ime_bp1 ime_bp2 ime_fm ///
	pia fmax diff_start family_ben ssi startpay_year ///
	age_filing male black hearings ///
	mental musc circulatory neoplasms other ///
	d_year1 d_year2 d_year3 d_year4 d_year14 d_year15 d_year16 d_year17 d_year18 d_year19 d_year110 ///
	e1234

* NOTE(review): this removes observations with ssi==0 from the main sample, while
* the SSI sample section applies no restriction on ssi. Confirm the coding of ssi
* and that this is the intended direction.
drop if ssi==0

* Each bend-point file renames the relevant distance variable to ime and adds
* its absolute value imeabs. preserve and restore keep the in-memory data intact
* between files.

* Lower bend point
preserve
rename ime_bp1 ime
gen imeabs=abs(ime)
save "${dta}\lower_`sample'_sample.dta", replace
restore

* Family bend point
preserve
keep if family_ben==1 & (diff_start==-1 | diff_start==0) /*dependent payments from start*/
rename ime_fm ime
gen imeabs=abs(ime)
save "${dta}\fm_`sample'_sample.dta", replace
restore

* Upper bend point
preserve
rename ime_bp2 ime
gen imeabs=abs(ime)
save "${dta}\upper_`sample'_sample.dta", replace
restore


*** SSI samples

local sample ssi

use "${dta}\master_file2.dta", clear

* Variables carried into the analysis files. The list is written with line
* continuation markers. The earlier version switched to the carriage-return
* delimiter right before a multi-line list that ended with a semicolon, so Stata
* read keep with no variable list and stopped with an error.
keep ///
	ime_bp1 ime_bp2 ime_fm ///
	fmax diff_start family_ben ssi startpay_year ///
	age_filing male black hearings ///
	mental musc circulatory neoplasms other ///
	d_year1 d_year2 d_year3 d_year4 d_year14 d_year15 d_year16 d_year17 d_year18 d_year19 d_year110 ///
	e1234

* NOTE(review): no restriction on ssi is applied in this section even though the
* files are named ssi. Confirm whether the restriction is meant to happen in the
* downstream analysis code.

* Each bend-point file renames the relevant distance variable to ime and adds
* its absolute value imeabs.

* Lower bend point
preserve
rename ime_bp1 ime
gen imeabs=abs(ime)
save "${dta}\lower_`sample'_sample.dta", replace
restore

* Family bend point
preserve
keep if family_ben==1 & (diff_start==-1 | diff_start==0) /*dependent payments from start*/
rename ime_fm ime
gen imeabs=abs(ime)
save "${dta}\fm_`sample'_sample.dta", replace
restore

* Upper bend point
preserve
rename ime_bp2 ime
gen imeabs=abs(ime)
save "${dta}\upper_`sample'_sample.dta", replace
restore


*** Placebo - non-dependent sample around family bend point

local sample nondepend

use "${dta}\master_file2.dta", clear

* Variables carried into the analysis file. The list is written with line
* continuation markers. The earlier version switched to the carriage-return
* delimiter right before a multi-line list that ended with a semicolon, so Stata
* read keep with no variable list and stopped with an error.
keep ///
	ime_bp1 ime_bp2 ime_fm ///
	fmax diff_start family_ben ssi startpay_year ///
	age_filing male black hearings ///
	mental musc circulatory neoplasms other ///
	d_year1 d_year2 d_year3 d_year4 d_year14 d_year15 d_year16 d_year17 d_year18 d_year19 d_year110 ///
	e1234

* Family bend point: only observations with family_ben==0 are written out, with
* ime_fm renamed to ime and its absolute value stored in imeabs.
preserve
keep if family_ben==0 /*no dependent payments*/
rename ime_fm ime
gen imeabs=abs(ime)
save "${dta}\fm_`sample'_sample.dta", replace
restore


*** Placebo - non-beneficiary samples using the Continuous Work History Sample

local sample nonbenef

use "${dta}\pop2005.dta", clear

* Mortality outcome and demographic indicators.
* NOTE(review): dead4 has no upper bound on ydeath. Presumably the death records
* in pop2005 stop four years after 2005. Confirm against the data.
gen dead4 = (ydeath~=. & ydeath>=2005)
label variable dead4 "Death within 4 years"
summarize dead4, detail
gen mort = (dead4*100)/4
summarize mort, detail
gen male  = (sex=="1")
gen black = (race=="B")

* 2005 bend points
local bp1_2005 "627"
local bp2_2005 "3779"
local fm_2005 "1728.49"

* Output file for each bend point (lower, family, upper).
local out_bp1 "${dta}\lower_`sample'_sample.dta"
local out_fm  "${dta}\fm_`sample'_sample.dta"
local out_bp2 "${dta}\upper_`sample'_sample.dta"

* Convert to 2020 price levels: 1.22 converts 2005 to 2013--1.1 2013 to 2020
local to2020 "1.226289644 * 1.1081"

* For each bend point: AIME relative to the bend point in nominal terms, the same
* distance at 2020 prices (ime), and its absolute value (imeabs).
foreach key in bp1 fm bp2 {
	preserve
	gen ime_`key'_nominal = aime - ``key'_2005'
	gen double ime = ime_`key'_nominal * `to2020'
	gen imeabs = abs(ime)
	save "`out_`key''", replace
	restore
}

* Wrap up: empty memory and close the log.
clear all

log close